Digit Recognizer

A BEGINNER'S GUIDE

Using

  • Multi-layer Perceptron (MLP) Model
  • Convolutional Neural Network (CNN) Model
  • Keras

Import Libraries


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set() # setting seaborn default for plots

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from keras.utils import np_utils
from keras.datasets import mnist

# for Multi-layer Perceptron (MLP) model
from keras.models import Sequential
from keras.layers import Dense

# for Convolutional Neural Network (CNN) model
from keras.layers import Dropout, Flatten
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D

# fix for issue: https://github.com/fchollet/keras/issues/2681
from keras import backend as K
K.set_image_dim_ordering('th')  # channels-first ('th') ordering: (channels, height, width)


Using TensorFlow backend.

Loading Train and Test datasets


In [3]:
train = pd.read_csv('train.csv')
print (train.shape)
train.head()


(42000, 785)
Out[3]:
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 785 columns


In [4]:
test = pd.read_csv('test.csv')
print (test.shape)
test.head()


(28000, 784)
Out[4]:
pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 784 columns


In [5]:
y_train = train['label']
X_train = train.drop(labels=['label'], axis=1)
X_test = test

print (y_train.value_counts())
sns.countplot(y_train)


1    4684
7    4401
3    4351
9    4188
2    4177
6    4137
0    4132
4    4072
8    4063
5    3795
Name: label, dtype: int64
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f88974e8510>

In [6]:
X_train.head()


Out[6]:
pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 784 columns


In [7]:
# check for corrupted images in the datasets
# i.e. check if there are any empty pixel values
print (X_train.isnull().any().sum())
print (X_test.isnull().any().sum())


0
0

Get values of data


In [8]:
X_train = X_train.values.astype('float32') # pixel values of all images in train set
y_train = y_train.values.astype('int32') # labels of all images
X_test = test.values.astype('float32') # pixel values of all images in test set

Viewing shape and content of data


In [9]:
print (X_train.shape)
print (y_train.shape)


(42000, 784)
(42000,)

In [10]:
print (y_train[0])
print (X_train[0])


1
[   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  188.  255.   94.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.  191.  250.  253.   93.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.  123.  248.  253.  167.   10.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.   80.  247.  253.
  208.   13.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   29.  207.  253.  235.   77.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.   54.  209.  253.  253.   88.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.   93.  254.  253.  238.  170.   17.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.   23.  210.  254.
  253.  159.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   16.  209.  253.  254.  240.   81.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.   27.  253.  253.  254.   13.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.   20.  206.  254.  254.  198.
    7.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.  168.
  253.  253.  196.    7.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.   20.  203.  253.  248.   76.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.   22.  188.  253.  245.   93.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.  103.  253.  253.
  191.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   89.  240.  253.  195.   25.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.   15.  220.  253.  253.   80.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.   94.  253.  253.  253.   94.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.   89.
  251.  253.  250.  131.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  214.  218.   95.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.]

Plotting images and their class values


In [11]:
plt.figure(figsize=[20,8])
for i in range(6):
    plt.subplot(1,6,i+1)
    # Here, we reshape the 784 pixels vector values into 28x28 pixels image
    plt.imshow(X_train[i].reshape(28, 28), cmap='gray', interpolation='none')
    plt.title("Class {}".format(y_train[i]))



In [12]:
# fix random seed for reproducibility
random_seed = 7
np.random.seed(random_seed)

Normalizing input values

As we can see above, the pixel values for each image are grayscale values between 0 and 255. We now normalize those values from the 0-255 range to the 0-1 range.


In [13]:
# pixel values are gray scale between 0 and 255
# normalize inputs from 0-255 to 0-1
X_train = X_train / 255
X_test = X_test / 255
print (X_train[1])


[ 0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.07058824  0.11764706  0.53725493
  0.53725493  0.75294119  0.33725491  0.28235295  0.00392157  0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.05098039  0.33725491  0.98039216  0.99607843
  0.99607843  0.99607843  0.99607843  0.8509804   0.96470588  0.59215689
  0.1254902   0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.0627451   0.7019608   0.99607843
  0.99607843  0.99607843  0.99607843  0.99607843  0.99607843  0.99607843
  0.99607843  0.99607843  0.90588236  0.21176471  0.05882353  0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.28235295
  0.99607843  0.99607843  0.99607843  0.99607843  0.99607843  0.99607843
  0.99607843  0.99607843  0.99607843  0.99607843  0.99607843  0.99607843
  0.40784314  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.23921569  0.74901962  0.99607843  0.99607843  0.99607843  0.99607843
  0.99607843  0.42745098  0.32549021  0.78039217  0.99607843  0.99607843
  0.99607843  0.99607843  0.95294118  0.33333334  0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.67450982  0.99607843  0.99607843  0.99607843
  0.79215688  0.57647061  0.57647061  0.17647059  0.          0.04313726
  0.11372549  0.78431374  0.99607843  0.99607843  0.99607843  0.67058825
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.00392157  0.68235296
  0.99607843  0.99607843  0.34901962  0.26274511  0.          0.          0.
  0.          0.          0.          0.50196081  0.98823529  0.99607843
  0.99607843  0.83137256  0.29803923  0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.18431373  0.99607843  0.99607843  0.99607843  0.11372549  0.          0.
  0.          0.          0.          0.          0.          0.
  0.32549021  0.99607843  0.99607843  0.99607843  0.60000002  0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.3137255   0.99607843  0.99607843  0.94117647  0.09411765
  0.          0.          0.          0.          0.          0.          0.
  0.          0.09803922  0.94117647  0.99607843  0.99607843  0.60000002
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.25098041  0.99607843  0.99607843
  0.72941178  0.02745098  0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.65098041  0.99607843
  0.99607843  0.87843138  0.04705882  0.          0.          0.          0.
  0.          0.          0.          0.          0.05490196  0.90980393
  0.99607843  0.99607843  0.99607843  0.11372549  0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.29411766  0.99607843  0.99607843  0.99607843  0.06666667  0.          0.
  0.          0.          0.          0.          0.          0.
  0.07058824  0.99607843  0.99607843  0.99607843  0.99607843  0.11372549
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.1882353   0.99607843  0.99607843  0.99607843
  0.06666667  0.          0.          0.          0.          0.          0.
  0.          0.          0.00784314  0.63921571  0.99607843  0.99607843
  0.99607843  0.11372549  0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.1882353   0.99607843
  0.99607843  0.99607843  0.06666667  0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.36862746
  0.99607843  0.99607843  0.99607843  0.78431374  0.04705882  0.          0.
  0.          0.          0.          0.          0.          0.0627451
  0.81960785  0.99607843  0.99607843  0.58823532  0.00392157  0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.05882353  0.80784315  0.99607843  0.99607843  0.99607843  0.79215688
  0.25882354  0.          0.          0.          0.          0.
  0.08235294  0.63137257  0.99607843  0.99607843  0.96078432  0.12156863
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.23529412  0.83137256
  0.99607843  0.99607843  0.99607843  0.76078433  0.1882353   0.1882353
  0.13333334  0.16078432  0.1882353   0.81960785  0.99607843  0.99607843
  0.99607843  0.67058825  0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.33725491  0.95294118  0.99607843  0.99607843  0.99607843
  0.99607843  0.99607843  0.9137255   0.95294118  0.99607843  0.99607843
  0.99607843  0.99607843  0.99607843  0.33725491  0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.44705883  0.99607843
  0.99607843  0.99607843  0.99607843  0.99607843  0.99607843  0.99607843
  0.99607843  0.99607843  0.99607843  0.93725491  0.33725491  0.04313726
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.05098039  0.71372551  0.99607843  0.99607843  0.99607843  0.99607843
  0.99607843  0.99607843  0.99607843  0.99607843  0.95294118  0.27450982
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.03137255  0.29803923  0.57254905
  0.99607843  1.          0.99607843  1.          0.57254905  0.07450981
  0.05882353  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]

Converting target variable values into one-hot format

The output/target variable takes values 0 to 9. As this is a multi-class classification problem, we convert the output class values into one-hot format, which is simply a binary matrix, i.e.

value 0 will be converted to one-hot format as [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

value 1 will be converted to one-hot format as [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

value 2 will be converted to one-hot format as [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]

and so on...


In [14]:
print (y_train.shape)
print (y_train[0])


(42000,)
1

In [15]:
# one hot encode outputs
# note that we have new variables with capital Y
# Y_train is different than y_train
Y_train = np_utils.to_categorical(y_train)
num_classes = Y_train.shape[1]

In [16]:
print (y_train.shape, Y_train.shape)
print (y_train[0], Y_train[0])


((42000,), (42000, 10))
(1, array([ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]))

Splitting train dataset into training and validation set

We split the train dataset into two parts in a 9:1 ratio: 90% becomes the actual training set and the remaining 10% the validation set. We train our model on the training set and check the accuracy of the model on the validation set.


In [17]:
# Split the entire training set into two separate sets: Training set and Validation set
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size = 0.10, random_state=random_seed)

In [18]:
print (X_train.shape, Y_train.shape, X_val.shape, Y_val.shape)
num_pixels = X_train.shape[1]


((37800, 784), (37800, 10), (4200, 784), (4200, 10))

In [19]:
print (Y_val)
# converting one-hot format of digits to normal values/labels
y_val = np.argmax(Y_val, 1) # reverse of to_categorical
print (y_val)
# Note that: capital Y_val contains values in one-hot format and small y_val contains normal digit values


[[ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 1.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  0.]]
[1 1 4 ..., 0 2 2]

Define Multi-layer Perceptron (MLP) Model

Generally, neural networks have the following properties:

  • an input layer represented as a single vector
  • zero or more hidden layers after the input layer
  • an output layer after the hidden layers, which represents the class scores in classification problems
  • each neuron in a hidden layer is fully connected to all neurons in the previous layer
  • neurons in a single layer function independently and have no connections to other neurons of the same layer

A single-layer perceptron model is the simplest kind of neural network, with only two layers: an input layer and an output layer. The inputs are fed directly to the outputs via a series of weights. It is a feed-forward network: information moves in only one direction, forward from the input nodes to the output nodes.
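
To make this concrete, here is a minimal NumPy sketch of a single-layer perceptron's forward pass; the inputs and weights below are made-up toy values, not learned ones:

import numpy as np

# toy forward pass: 784 inputs connected directly to 10 output neurons
rng = np.random.RandomState(7)
x = rng.rand(784)               # one flattened 28x28 image (toy values)
W = rng.randn(784, 10) * 0.01   # one weight per input-output connection
b = np.zeros(10)                # one bias per output neuron

scores = x.dot(W) + b           # weighted sums flow forward to the outputs
print (scores.argmax())         # predicted class = index of highest score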

A multi-layer perceptron model adds one or more hidden layers between the input and output layers. Information still flows forward, from the input layer through the hidden layers to the output layer, and the model is trained with the back-propagation method. In back-propagation, the error is calculated at the output layer as the difference between the actual output and the predicted output, then propagated back through the network layers. Based on this error, the algorithm adjusts the weights of each connection in order to reduce the error value. Learning with many such stacked layers is what is usually referred to as deep learning.
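
As a toy illustration of this weight-update idea (a sketch only, not the exact procedure Keras runs internally), here is a single gradient-descent step for one linear neuron with a squared-error loss:

import numpy as np

# one gradient-descent step for a single linear neuron, squared-error loss
x = np.array([0.5, 0.2, 0.1])   # toy input
w = np.array([0.1, 0.1, 0.1])   # toy initial weights
target = 1.0                    # actual output
lr = 0.1                        # learning rate

pred = w.dot(x)                 # forward pass: predicted output
error = pred - target           # difference of predicted and actual output
grad = error * x                # gradient of 0.5 * error**2 w.r.t. w
w = w - lr * grad               # adjust weights to reduce the error
print (pred, w)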

We create a simple neural network model with one hidden layer of 784 neurons. Our input layer will also have 784 neurons, as we have flattened our training images into 784-dimensional vectors.

softmax activation is used in the output layer to turn the raw class scores into class probabilities.

adam gradient descent optimizer is used to learn the weights.
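
For reference, softmax simply turns the raw class scores into probabilities that sum to 1; a minimal NumPy sketch:

import numpy as np

# softmax turns raw class scores into probabilities that sum to 1
def softmax(z):
    e = np.exp(z - z.max())     # subtract the max for numerical stability
    return e / e.sum()

print (softmax(np.array([2.0, 1.0, 0.1])))  # ~[0.66, 0.24, 0.10]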


In [20]:
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(num_pixels, input_dim=num_pixels, kernel_initializer='normal', activation='relu'))
    model.add(Dense(num_classes, kernel_initializer='normal', activation='softmax'))
    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

Fit and Evaluate Model

The model is fit over 5 epochs, taking a batch of 200 images in each weight update. The validation set is used to monitor performance during training; increasing the number of epochs may improve accuracy.

Finally, the validation set is used to evaluate the model by calculating its classification accuracy.


In [21]:
model = baseline_model()
model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=5, batch_size=200, verbose=1)


Train on 37800 samples, validate on 4200 samples
Epoch 1/5
37800/37800 [==============================] - 29s - loss: 0.3345 - acc: 0.9056 - val_loss: 0.1730 - val_acc: 0.9488
Epoch 2/5
37800/37800 [==============================] - 16s - loss: 0.1388 - acc: 0.9603 - val_loss: 0.1234 - val_acc: 0.9657
Epoch 3/5
37800/37800 [==============================] - 17s - loss: 0.0904 - acc: 0.9742 - val_loss: 0.1047 - val_acc: 0.9731
Epoch 4/5
37800/37800 [==============================] - 20s - loss: 0.0631 - acc: 0.9819 - val_loss: 0.0937 - val_acc: 0.9738
Epoch 5/5
37800/37800 [==============================] - 21s - loss: 0.0464 - acc: 0.9871 - val_loss: 0.0827 - val_acc: 0.9795
Out[21]:
<keras.callbacks.History at 0x7f8896f02610>

In [22]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_1 (Dense)              (None, 784)               615440    
_________________________________________________________________
dense_2 (Dense)              (None, 10)                7850      
=================================================================
Total params: 623,290
Trainable params: 623,290
Non-trainable params: 0
_________________________________________________________________

In [23]:
scores = model.evaluate(X_val, Y_val, verbose=0)
print (scores)
print ('Score: {}'.format(scores[0]))
print ('Accuracy: {}'.format(scores[1]))


[0.082673994320489108, 0.97952380952380957]
Score: 0.0826739943205
Accuracy: 0.979523809524

Plot correctly and incorrectly predicted images

Let's plot some images which are correctly predicted and some images which are incorrectly predicted on our validation dataset.


In [24]:
# get predicted values
predicted_classes = model.predict_classes(X_val)


4096/4200 [============================>.] - ETA: 0s

In [25]:
# get index list of all correctly predicted values
correct_indices = np.nonzero(np.equal(predicted_classes, y_val))[0]

# get index list of all incorrectly predicted values
incorrect_indices = np.nonzero(np.not_equal(predicted_classes, y_val))[0]

In [26]:
print ('Correctly predicted: %i' % np.size(correct_indices))
print ('Incorrectly predicted: %i' % np.size(incorrect_indices))


Correctly predicted: 4114
Incorrectly predicted: 86

In [27]:
plt.figure(figsize=[20,8])
for i, correct in enumerate(correct_indices[:6]):
    plt.subplot(1,6,i+1)
    plt.imshow(X_val[correct].reshape(28,28), cmap='gray', interpolation='none')
    plt.title("Predicted {}, Class {}".format(predicted_classes[correct], y_val[correct]))
    
plt.figure(figsize=[20,8])
for i, incorrect in enumerate(incorrect_indices[:6]):
    plt.subplot(1,6,i+1)
    plt.imshow(X_val[incorrect].reshape(28,28), cmap='gray', interpolation='none')
    plt.title("Predicted {}, Class {}".format(predicted_classes[incorrect], y_val[incorrect]))


Confusion Matrix


In [28]:
# we have digit labels from 0 to 9
# we can either manually create a class variable with those labels
# class_names = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# or, we can take unique values from train dataset's labels
class_names = np.unique(y_train)

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_val, predicted_classes)
np.set_printoptions(precision=2)

print ('Confusion Matrix in Numbers')
print (cnf_matrix)
print ('')

cnf_matrix_percent = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]

print ('Confusion Matrix in Percentage')
print (cnf_matrix_percent)
print ('')

true_class_names = class_names
predicted_class_names = class_names

df_cnf_matrix = pd.DataFrame(cnf_matrix, 
                             index = true_class_names,
                             columns = predicted_class_names)

df_cnf_matrix_percent = pd.DataFrame(cnf_matrix_percent, 
                                     index = true_class_names,
                                     columns = predicted_class_names)

plt.figure(figsize = (8,6))

#plt.subplot(121)
ax = sns.heatmap(df_cnf_matrix, annot=True, fmt='d')
ax.set_ylabel('True values')
ax.set_xlabel('Predicted values')
ax.set_title('Confusion Matrix in Numbers')

'''
plt.subplot(122)
ax = sns.heatmap(df_cnf_matrix_percent, annot=True)
ax.set_ylabel('True values')
ax.set_xlabel('Predicted values')
'''


Confusion Matrix in Numbers
[[427   0   1   0   0   0   0   0   0   1]
 [  0 495   1   0   0   0   0   1   0   1]
 [  0   2 419   2   1   0   2   6   1   0]
 [  1   0   0 433   0   2   0   4   3   1]
 [  1   0   0   0 389   1   1   3   0   4]
 [  0   0   1   5   1 340   0   1   1   0]
 [  1   0   0   0   3   1 400   0   1   0]
 [  1   1   1   0   0   0   0 434   1   5]
 [  1   3   0   3   1   0   1   0 392   0]
 [  2   1   0   1   4   0   0   5   0 385]]

Confusion Matrix in Percentage
[[ 1.    0.    0.    0.    0.    0.    0.    0.    0.    0.  ]
 [ 0.    0.99  0.    0.    0.    0.    0.    0.    0.    0.  ]
 [ 0.    0.    0.97  0.    0.    0.    0.    0.01  0.    0.  ]
 [ 0.    0.    0.    0.98  0.    0.    0.    0.01  0.01  0.  ]
 [ 0.    0.    0.    0.    0.97  0.    0.    0.01  0.    0.01]
 [ 0.    0.    0.    0.01  0.    0.97  0.    0.    0.    0.  ]
 [ 0.    0.    0.    0.    0.01  0.    0.99  0.    0.    0.  ]
 [ 0.    0.    0.    0.    0.    0.    0.    0.98  0.    0.01]
 [ 0.    0.01  0.    0.01  0.    0.    0.    0.    0.98  0.  ]
 [ 0.01  0.    0.    0.    0.01  0.    0.    0.01  0.    0.97]]

Out[28]:
"\nplt.subplot(122)\nax = sns.heatmap(df_cnf_matrix_percent, annot=True)\nax.set_ylabel('True values')\nax.set_xlabel('Predicted values')\n"

The above confusion matrix heatmap shows that:

  • The most common confusion was digit 2 predicted as 7: 6 images of digit 2 were predicted as 7.
  • Similarly, 5 images of digit 9 were predicted as 7.
  • Likewise, 5 images of digit 5 were predicted as 3.

The accuracy of the model may improve if we increase the number of epochs while fitting the model. Currently it is set to 5; we could increase it to 10 and compare the resulting accuracy.

Improve Accuracy using a Convolutional Neural Network (CNN) Model

Convolutional Neural Networks (CNNs) are similar to multi-layer perceptron networks: they too are made up of neurons with learnable weights and biases. CNNs have been applied very successfully to analyzing visual imagery, and are widely used in image and video recognition, recommender systems and natural language processing.

A CNN consists of multiple hidden layers. The hidden layers are either convolutional, pooling or fully connected layers.

Convolution layer: Feature extraction happens in this layer. The layer applies a convolution operation to the input and passes the result to the next layer. In the image classification problem, a weight matrix (a filter) is defined in the convolution layer. A dot product is computed between the weight matrix and a small patch of the input image (of the same size as the weight matrix). The weight matrix then slides across the image so that every pixel is covered at least once, producing a convolved output.

The weight matrix behaves like a filter, extracting particular information from the original image matrix.

One weight combination might extract edges, another might extract a particular color, and another might simply blur out unwanted noise.

The weights are learned by minimizing the loss function, just as in a multi-layer perceptron. They are therefore learned to extract the features of the original image that help the network make correct predictions.

When we stack multiple convolutional layers, the initial layers extract more generic features, while the features extracted deeper in the network become more and more complex and more suited to the problem at hand.

Reference: Architecture of Convolutional Neural Networks (CNNs) demystified

Stride: While computing the dot products, if the weight matrix moves 1 pixel at a time, we call it a stride of 1. The size of the output keeps shrinking as we increase the stride value.

Padding: Padding the image with one or more layers of zeros resolves the output size reduction caused by convolution and striding. With appropriate padding, the initial size of the image is retained, as the sketch below shows.
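
As a minimal sketch (the 3x3 filter values below are arbitrary, chosen only for illustration), a single-channel convolution with stride and zero padding can be written in plain NumPy; the output size follows (W - F + 2P) / S + 1 for input size W, filter size F, padding P and stride S:

import numpy as np

# toy single-channel 2D convolution with stride and zero padding;
# output size = (W - F + 2P) / S + 1
def conv2d(image, kernel, stride=1, pad=0):
    if pad:
        image = np.pad(image, pad, mode='constant')  # zero padding
    F = kernel.shape[0]
    out = (image.shape[0] - F) // stride + 1
    result = np.zeros((out, out))
    for i in range(out):
        for j in range(out):
            patch = image[i*stride:i*stride+F, j*stride:j*stride+F]
            result[i, j] = (patch * kernel).sum()  # dot product with filter
    return result

img = np.random.rand(28, 28)              # stand-in for one grayscale image
kernel = np.array([[1., 0., -1.]] * 3)    # arbitrary 3x3 edge-like filter
print (conv2d(img, kernel).shape)         # (26, 26): (28 - 3)/1 + 1
print (conv2d(img, kernel, pad=1).shape)  # (28, 28): padding retains size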

Pooling layer: This layer reduces the number of feature parameters. When the image representation is too large, we place a pooling layer between two convolution layers. The sole purpose of pooling is to reduce the spatial size of the representation, which cuts the number of trainable parameters and also helps control overfitting. (A small max-pooling sketch follows after the list below.)

  • Max pooling: uses the maximum value from each cluster of the prior layer
  • Average pooling: uses the average value from each cluster of the prior layer
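
A minimal NumPy sketch of 2x2 max pooling with stride 2 on a made-up 4x4 feature map:

import numpy as np

# 2x2 max pooling with stride 2: keep the maximum of each 2x2 cluster
def max_pool(feature_map, size=2):
    h, w = feature_map.shape
    trimmed = feature_map[:h // size * size, :w // size * size]
    return trimmed.reshape(h // size, size, w // size, size).max(axis=(1, 3))

fm = np.array([[1., 3., 2., 0.],
               [4., 2., 1., 1.],
               [0., 1., 5., 6.],
               [2., 2., 7., 8.]])
print (max_pool(fm))  # [[ 4.  2.]  [ 2.  8.]]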

Fully connected layer: This layer comes after the convolution and pooling layers. It connects each neuron in one layer to every neuron in the next layer, just like the layer connections of a multi-layer perceptron model. The error is computed at the output layer as the difference between the actual output and the predicted output; back-propagation is then used to update the weights and biases and reduce the loss.

Load train and test data

Let's again load the train and test datasets.


In [29]:
train = pd.read_csv('train.csv')
print (train.shape)
train.head()


(42000, 785)
Out[29]:
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 785 columns


In [30]:
test = pd.read_csv('test.csv')
print (test.shape)
test.head()


(28000, 784)
Out[30]:
pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 784 columns


In [31]:
y_train = train['label']
X_train = train.drop(labels=['label'], axis=1)
X_test = test

Get values of data


In [32]:
X_train = X_train.values.astype('float32') # pixel values of all images in train set
y_train = y_train.values.astype('int32') # labels of all images
X_test = test.values.astype('float32') # pixel values of all images in test set

View shape and content of data


In [33]:
print (X_train.shape)
print (y_train.shape)
print (X_train[1])


(42000, 784)
(42000,)
[   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.   18.   30.  137.  137.  192.   86.   72.    1.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.   13.   86.  250.  254.  254.  254.  254.  217.
  246.  151.   32.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.   16.  179.  254.  254.  254.
  254.  254.  254.  254.  254.  254.  231.   54.   15.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.   72.
  254.  254.  254.  254.  254.  254.  254.  254.  254.  254.  254.  254.
  104.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.   61.  191.  254.  254.  254.  254.  254.  109.   83.  199.
  254.  254.  254.  254.  243.   85.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.  172.  254.  254.  254.  202.  147.
  147.   45.    0.   11.   29.  200.  254.  254.  254.  171.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    1.  174.  254.
  254.   89.   67.    0.    0.    0.    0.    0.    0.  128.  252.  254.
  254.  212.   76.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.   47.  254.  254.  254.   29.    0.    0.    0.    0.    0.    0.
    0.    0.   83.  254.  254.  254.  153.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.   80.  254.  254.  240.   24.    0.    0.
    0.    0.    0.    0.    0.    0.   25.  240.  254.  254.  153.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.   64.  254.  254.
  186.    7.    0.    0.    0.    0.    0.    0.    0.    0.    0.  166.
  254.  254.  224.   12.    0.    0.    0.    0.    0.    0.    0.    0.
   14.  232.  254.  254.  254.   29.    0.    0.    0.    0.    0.    0.
    0.    0.    0.   75.  254.  254.  254.   17.    0.    0.    0.    0.
    0.    0.    0.    0.   18.  254.  254.  254.  254.   29.    0.    0.
    0.    0.    0.    0.    0.    0.    0.   48.  254.  254.  254.   17.
    0.    0.    0.    0.    0.    0.    0.    0.    2.  163.  254.  254.
  254.   29.    0.    0.    0.    0.    0.    0.    0.    0.    0.   48.
  254.  254.  254.   17.    0.    0.    0.    0.    0.    0.    0.    0.
    0.   94.  254.  254.  254.  200.   12.    0.    0.    0.    0.    0.
    0.    0.   16.  209.  254.  254.  150.    1.    0.    0.    0.    0.
    0.    0.    0.    0.    0.   15.  206.  254.  254.  254.  202.   66.
    0.    0.    0.    0.    0.   21.  161.  254.  254.  245.   31.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.   60.  212.
  254.  254.  254.  194.   48.   48.   34.   41.   48.  209.  254.  254.
  254.  171.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.   86.  243.  254.  254.  254.  254.  254.  233.  243.
  254.  254.  254.  254.  254.   86.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.  114.  254.  254.  254.
  254.  254.  254.  254.  254.  254.  254.  239.   86.   11.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   13.  182.  254.  254.  254.  254.  254.  254.  254.  254.  243.   70.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    8.   76.  146.  254.  255.  254.  255.
  146.   19.   15.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.]

Normalizing input values

As we can see above, the pixel values for each image are grayscale values between 0 and 255. We now normalize those values from the 0-255 range to the 0-1 range.


In [34]:
# pixel values are gray scale between 0 and 255
# normalize inputs from 0-255 to 0-1
X_train = X_train / 255
X_test = X_test / 255
print (X_train[1])


[ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.07  0.12  0.54  0.54  0.75  0.34  0.28
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.05  0.34  0.98  1.    1.    1.    1.
  0.85  0.96  0.59  0.13  0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.06  0.7   1.    1.    1.    1.
  1.    1.    1.    1.    1.    0.91  0.21  0.06  0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.28  1.    1.    1.
  1.    1.    1.    1.    1.    1.    1.    1.    1.    0.41  0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.24  0.75
  1.    1.    1.    1.    1.    0.43  0.33  0.78  1.    1.    1.    1.
  0.95  0.33  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.67  1.    1.    1.    0.79  0.58  0.58  0.18  0.    0.04  0.11
  0.78  1.    1.    1.    0.67  0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.68  1.    1.    0.35  0.26  0.    0.    0.    0.
  0.    0.    0.5   0.99  1.    1.    0.83  0.3   0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.18  1.    1.    1.    0.11  0.    0.    0.
  0.    0.    0.    0.    0.    0.33  1.    1.    1.    0.6   0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.31  1.    1.    0.94  0.09
  0.    0.    0.    0.    0.    0.    0.    0.    0.1   0.94  1.    1.    0.6
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.25  1.    1.
  0.73  0.03  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.65
  1.    1.    0.88  0.05  0.    0.    0.    0.    0.    0.    0.    0.
  0.05  0.91  1.    1.    1.    0.11  0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.29  1.    1.    1.    0.07  0.    0.    0.    0.    0.    0.
  0.    0.    0.07  1.    1.    1.    1.    0.11  0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.19  1.    1.    1.    0.07  0.    0.    0.    0.
  0.    0.    0.    0.    0.01  0.64  1.    1.    1.    0.11  0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.19  1.    1.    1.    0.07  0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.37  1.    1.    1.    0.78
  0.05  0.    0.    0.    0.    0.    0.    0.    0.06  0.82  1.    1.
  0.59  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.06
  0.81  1.    1.    1.    0.79  0.26  0.    0.    0.    0.    0.    0.08
  0.63  1.    1.    0.96  0.12  0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.24  0.83  1.    1.    1.    0.76  0.19  0.19  0.13
  0.16  0.19  0.82  1.    1.    1.    0.67  0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.34  0.95  1.    1.    1.    1.
  1.    0.91  0.95  1.    1.    1.    1.    1.    0.34  0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.45  1.    1.
  1.    1.    1.    1.    1.    1.    1.    1.    0.94  0.34  0.04  0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.05  0.71  1.    1.    1.    1.    1.    1.    1.    1.    0.95  0.27
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.03  0.3   0.57  1.    1.    1.    1.    0.57
  0.07  0.06  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.  ]

Converting target variable values into one-hot format

The output/target variable takes values 0 to 9. As this is a multi-class classification problem, we convert the output class values into one-hot format, which is simply a binary matrix, i.e.

value 0 will be converted to one-hot format as [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

value 1 will be converted to one-hot format as [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

value 2 will be converted to one-hot format as [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]

and so on...


In [35]:
print (y_train.shape)
print (y_train[0])


(42000,)
1

In [36]:
# one hot encode outputs
# note that we have new variables with capital Y
# Y_train is different than y_train
Y_train = np_utils.to_categorical(y_train)
num_classes = Y_train.shape[1]

In [37]:
print (y_train.shape, Y_train.shape)
print (y_train[0], Y_train[0])


((42000,), (42000, 10))
(1, array([ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]))

Splitting train dataset into training and validation set

We split the train dataset into two parts in a 9:1 ratio: 90% becomes the actual training set and the remaining 10% the validation set. We train our model on the training set and check the accuracy of the model on the validation set.


In [38]:
# Split the entire training set into two separate sets: Training set and Validation set
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size = 0.10, random_state=random_seed)

In [39]:
print (X_train.shape, Y_train.shape, X_val.shape, Y_val.shape)
num_pixels = X_train.shape[1]


((37800, 784), (37800, 10), (4200, 784), (4200, 10))

In [40]:
print (Y_val)
# converting one-hot format of digits to normal values/labels
y_val = np.argmax(Y_val, 1) # reverse of to_categorical
print (y_val)
# Note that: capital Y_val contains values in one-hot format and small y_val contains normal digit values


[[ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 1.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  0.]]
[1 1 4 ..., 0 2 2]

Reshaping images

The input dimensions expected by Keras for 2D (two-dimensional) convolution are in the format [channels][height][width], since we set channels-first ('th') ordering at the top of the notebook.

For an RGB color image, the first dimension (channels) would be 3, for the red, green and blue components. It's like having 3 image inputs for every single color image. In our case (the MNIST handwritten images), we have grayscale images, so the channels dimension is set to 1.


In [41]:
# reshape to be [samples][channels][height][width]
X_train = X_train.reshape(X_train.shape[0], 1, 28, 28).astype('float32')
X_test = X_test.reshape(X_test.shape[0], 1, 28, 28).astype('float32')
X_val = X_val.reshape(X_val.shape[0], 1, 28, 28).astype('float32')

print (num_pixels, X_train.shape, X_test.shape, X_val.shape)


(784, (37800, 1, 28, 28), (28000, 1, 28, 28), (4200, 1, 28, 28))

In [42]:
print (X_train[1])


[[[ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.3   0.99  0.4   0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.79  0.99  0.85  0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.76  0.99  0.85  0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.99  0.99  0.85  0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.99  0.99  0.85  0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.99  0.99  0.85  0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.99  0.99  0.85  0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.18  0.99  0.99  0.67  0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.67  0.99  0.99  0.38  0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.95  0.99  0.9   0.02  0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.43  0.99  1.    0.43  0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.47  0.99  0.99  0.42  0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.9   0.99  0.96  0.16  0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.18
    0.94  0.99  0.63  0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.27
    0.96  0.99  0.47  0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.38
    0.99  0.99  0.47  0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.38
    0.99  0.99  0.32  0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.78
    0.99  0.99  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.85
    0.99  0.56  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.62
    0.99  0.07  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]
  [ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.  ]]]

Define Convolutional Neural Network (CNN) Model

Convolution Layer

  • We define 32 feature maps with a filter size of 5x5
  • We use ReLU (Rectified Linear Units) as the activation function
  • This layer expects input images of size 1x28x28 ([channels][height][width])

Max Pooling Layer

  • It has a pool size of 2x2

Dropout Layer

  • Configured to randomly exclude 20% of neurons in the layer to reduce overfitting

Flatten

  • Flattens the image into a one-dimensional vector, which is required as input by the fully connected layer

Fully connected Layer

  • Contains 128 neurons
  • relu is used as an activation function
  • Output layer has num_classes=10 neurons for the 10 classes
  • softmax activation function is used in the output layer
  • adam gradient descent algorithm is used as optimizer to learn and update weights

In [43]:
# baseline model for CNN
def baseline_model():
    # create model    
    model = Sequential()    
    model.add(Conv2D(32, (5, 5), input_shape=(1, 28, 28), activation='relu'))    
    model.add(MaxPooling2D(pool_size=(2, 2)))    
    model.add(Dropout(0.2))
    model.add(Flatten())    
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))    
    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

To compile the model, Keras offers several optimizers, such as the Stochastic Gradient Descent (SGD) optimizer, the Adam optimizer and the RMSprop optimizer.


In [44]:
# Example of using RMSprop optimizer
#from keras.optimizers import RMSprop, SGD
#model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.001), metrics=['accuracy'])
#model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.001), metrics=['accuracy'])

Fit and Evaluate Model

The model is fit over 5 epochs, taking a batch of 200 images in each weight update. The validation data is used to monitor performance during training; increasing the number of epochs may improve accuracy.

Finally, the validation data is used to evaluate the model by calculating its classification accuracy.


In [45]:
model = baseline_model()
history = model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=5, batch_size=200, verbose=1)


Train on 37800 samples, validate on 4200 samples
Epoch 1/5
37800/37800 [==============================] - 372s - loss: 0.3050 - acc: 0.9128 - val_loss: 0.1149 - val_acc: 0.9693
Epoch 2/5
37800/37800 [==============================] - 345s - loss: 0.0981 - acc: 0.9710 - val_loss: 0.0854 - val_acc: 0.9767
Epoch 3/5
37800/37800 [==============================] - 336s - loss: 0.0656 - acc: 0.9799 - val_loss: 0.0565 - val_acc: 0.9852
Epoch 4/5
37800/37800 [==============================] - 337s - loss: 0.0493 - acc: 0.9846 - val_loss: 0.0570 - val_acc: 0.9848
Epoch 5/5
37800/37800 [==============================] - 337s - loss: 0.0430 - acc: 0.9863 - val_loss: 0.0546 - val_acc: 0.9862

In [46]:
history_dict = history.history
history_dict.keys()


Out[46]:
['acc', 'loss', 'val_acc', 'val_loss']

In [47]:
plt.figure(figsize=[10,4])

plt.subplot(121)
plt.plot(range(1, len(history_dict['val_acc'])+1), history_dict['val_acc'])
plt.xlabel('Epochs')
plt.ylabel('Accuracy')

plt.subplot(122)
plt.plot(range(1, len(history_dict['val_loss'])+1), history_dict['val_loss'])
plt.xlabel('Epochs')
plt.ylabel('Loss')


Out[47]:
<matplotlib.text.Text at 0x7f8897019710>

In [48]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d_1 (Conv2D)            (None, 32, 24, 24)        832       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 32, 12, 12)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 32, 12, 12)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 4608)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               589952    
_________________________________________________________________
dense_4 (Dense)              (None, 10)                1290      
=================================================================
Total params: 592,074
Trainable params: 592,074
Non-trainable params: 0
_________________________________________________________________

In [49]:
scores = model.evaluate(X_val, Y_val, verbose=0)
print (scores)
print ('Score: {}'.format(scores[0]))
print ('Accuracy: {}'.format(scores[1]))


[0.054642491417360446, 0.98619047619047617]
Score: 0.0546424914174
Accuracy: 0.98619047619

The accuracy (98.61%) of the Convolutional Neural Network (CNN) model is an improvement over the accuracy (97.95%) of the Multi-layer Perceptron (MLP) model.

The accuracy of the CNN model can be increased further by:

  • increasing the number of epochs while fitting the model
  • adding more convolution and pooling layers to the model (a sketch of such a deeper model follows below)
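
As a hedged sketch of the second idea, here is one possible deeper architecture reusing the layers imported at the top of the notebook; the filter counts and layer sizes are arbitrary, untuned choices, not values from this tutorial:

# a sketch of a deeper CNN: two convolution + pooling stages before the
# fully connected layers; sizes below are arbitrary, untuned choices
def larger_model():
    model = Sequential()
    model.add(Conv2D(30, (5, 5), input_shape=(1, 28, 28), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(15, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model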

Plot correctly and incorrectly predicted images

Let's plot some images which are correctly predicted and some images which are incorrectly predicted on our validation dataset.


In [50]:
# get predicted values
predicted_classes = model.predict_classes(X_val)


4200/4200 [==============================] - 24s    

In [51]:
# get index list of all correctly predicted values
correct_indices = np.nonzero(np.equal(predicted_classes, y_val))[0]

# get index list of all incorrectly predicted values
incorrect_indices = np.nonzero(np.not_equal(predicted_classes, y_val))[0]

In [52]:
print ('Correctly predicted: %i' % np.size(correct_indices))
print ('Incorrectly predicted: %i' % np.size(incorrect_indices))


Correctly predicted: 4142
Incorrectly predicted: 58
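
As a quick cross-check, the error count is consistent with the validation accuracy reported by model.evaluate above:

In [ ]:
# 58 errors out of 4200 validation images, about 1.38%, i.e. 1 - 0.9862
error_rate = np.size(incorrect_indices) / float(np.size(y_val))
print ('Validation error rate: {:.2%}'.format(error_rate))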

In [53]:
plt.figure(figsize=[20,8])
for i, correct in enumerate(correct_indices[:6]):
    plt.subplot(1,6,i+1)
    plt.imshow(X_val[correct].reshape(28,28), cmap='gray', interpolation='none')
    plt.title("Predicted {}, Class {}".format(predicted_classes[correct], y_val[correct]))
    
plt.figure(figsize=[20,8])
for i, incorrect in enumerate(incorrect_indices[:6]):
    plt.subplot(1,6,i+1)
    plt.imshow(X_val[incorrect].reshape(28,28), cmap='gray', interpolation='none')
    plt.title("Predicted {}, Class {}".format(predicted_classes[incorrect], y_val[incorrect]))


Confusion Matrix


In [54]:
# we have digit labels from 0 to 9
# we can either manually create a class variable with those labels
# class_names = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# or, we can take unique values from train dataset's labels
class_names = np.unique(y_train)

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_val, predicted_classes)
np.set_printoptions(precision=2)

print ('Confusion Matrix in Numbers')
print (cnf_matrix)
print ('')

cnf_matrix_percent = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]

print ('Confusion Matrix in Percentage')
print (cnf_matrix_percent)
print ('')

true_class_names = class_names
predicted_class_names = class_names

df_cnf_matrix = pd.DataFrame(cnf_matrix, 
                             index = true_class_names,
                             columns = predicted_class_names)

df_cnf_matrix_percent = pd.DataFrame(cnf_matrix_percent, 
                                     index = true_class_names,
                                     columns = predicted_class_names)

plt.figure(figsize = (8,6))

#plt.subplot(121)
ax = sns.heatmap(df_cnf_matrix, annot=True, fmt='d')
ax.set_ylabel('True values')
ax.set_xlabel('Predicted values')
ax.set_title('Confusion Matrix in Numbers')

# the percentage heatmap is kept inside a string (effectively commented out)
# so that this cell draws a single heatmap; it is plotted in a separate cell below
'''
plt.subplot(122)
ax = sns.heatmap(df_cnf_matrix_percent, annot=True)
ax.set_ylabel('True values')
ax.set_xlabel('Predicted values')
'''


Confusion Matrix in Numbers
[[428   0   0   1   0   0   0   0   0   0]
 [  0 494   2   0   0   0   0   2   0   0]
 [  0   0 426   0   2   0   0   4   1   0]
 [  1   0   0 436   0   1   0   2   3   1]
 [  1   0   0   0 390   0   2   2   0   4]
 [  1   0   0   2   0 346   0   0   0   0]
 [  1   0   0   0   2   1 401   0   0   1]
 [  1   0   1   0   1   0   0 437   0   3]
 [  1   2   1   0   0   0   1   0 394   2]
 [  1   0   2   0   2   0   0   3   0 390]]

Confusion Matrix in Percentage
[[ 1.    0.    0.    0.    0.    0.    0.    0.    0.    0.  ]
 [ 0.    0.99  0.    0.    0.    0.    0.    0.    0.    0.  ]
 [ 0.    0.    0.98  0.    0.    0.    0.    0.01  0.    0.  ]
 [ 0.    0.    0.    0.98  0.    0.    0.    0.    0.01  0.  ]
 [ 0.    0.    0.    0.    0.98  0.    0.01  0.01  0.    0.01]
 [ 0.    0.    0.    0.01  0.    0.99  0.    0.    0.    0.  ]
 [ 0.    0.    0.    0.    0.    0.    0.99  0.    0.    0.  ]
 [ 0.    0.    0.    0.    0.    0.    0.    0.99  0.    0.01]
 [ 0.    0.    0.    0.    0.    0.    0.    0.    0.98  0.  ]
 [ 0.    0.    0.01  0.    0.01  0.    0.    0.01  0.    0.98]]

Out[54]:
"\nplt.subplot(122)\nax = sns.heatmap(df_cnf_matrix_percent, annot=True)\nax.set_ylabel('True values')\nax.set_xlabel('Predicted values')\n"

Using the Multi-layer Perceptron (MLP) model, the confusion-matrix heatmap showed the following:

  • The most frequent error was digit 2 predicted as 7: 6 images of digit 2 were predicted as 7.
  • Similarly, 6 images of digit 9 were predicted as 7.
  • The third most frequent error was digit 5 predicted as 3: 5 images of digit 5 were predicted as 3.

Using the Convolutional Neural Network (CNN) model, each of these errors improved (see the sketch below for how to read such counts off the matrix):

  • Images of digit 2 predicted as 7 dropped from 6 to 4.
  • Images of digit 9 predicted as 7 dropped from 6 to 3.
  • Images of digit 5 predicted as 3 dropped from 5 to 2.
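
These counts can be read off cnf_matrix programmatically; a minimal sketch that lists its largest off-diagonal entries (i.e. the most frequent true/predicted confusions):

In [ ]:
# list the most frequent confusions: largest off-diagonal entries
errors = cnf_matrix.copy()
np.fill_diagonal(errors, 0)  # zero out correct predictions on the diagonal
for idx in np.argsort(errors, axis=None)[::-1][:5]:
    true_label, pred_label = np.unravel_index(idx, errors.shape)
    print ('{} predicted as {}: {} images'.format(true_label, pred_label, errors[true_label, pred_label]))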

As noted earlier, the accuracy of the CNN model can be further increased by:

  • increasing the number of epochs while fitting the model
  • adding more convolution and pooling layers to the model

We try both below.

Improving accuracy using multiple CNN layers

Let's try adding a second convolution layer (Conv2D) and additional fully-connected layers (Dense) as well.

The second convolution layer will have 64 filters with a 3x3 kernel.

Two fully-connected hidden layers follow the flattening step: one with 512 neurons and one with 1024 neurons, each followed by dropout.

We also use 10 epochs this time instead of 5.


In [55]:
def baseline_model():
    # create model
    model = Sequential()
    
    # first convolution block: 32 filters of size 5x5
    model.add(Conv2D(filters=32, kernel_size=(5, 5), input_shape=(1, 28, 28), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    
    # second convolution block: 64 filters of size 3x3
    model.add(Conv2D(filters=64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    
    # flatten feature maps into a vector for the fully-connected layers
    model.add(Flatten())
    
    # two fully-connected hidden layers, each followed by dropout
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.25))
    
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.5))
    
    # output layer: one neuron per digit class
    model.add(Dense(num_classes, activation='softmax'))
    
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

In [56]:
# build the model
model = baseline_model()

# fit the model
history = model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=10, batch_size=200)


Train on 37800 samples, validate on 4200 samples
Epoch 1/10
37800/37800 [==============================] - 554s - loss: 0.3512 - acc: 0.8853 - val_loss: 0.0771 - val_acc: 0.9783
Epoch 2/10
37800/37800 [==============================] - 547s - loss: 0.0891 - acc: 0.9722 - val_loss: 0.0565 - val_acc: 0.9840
Epoch 3/10
37800/37800 [==============================] - 527s - loss: 0.0657 - acc: 0.9793 - val_loss: 0.0449 - val_acc: 0.9871
Epoch 4/10
37800/37800 [==============================] - 533s - loss: 0.0504 - acc: 0.9841 - val_loss: 0.0491 - val_acc: 0.9867
Epoch 5/10
37800/37800 [==============================] - 526s - loss: 0.0466 - acc: 0.9848 - val_loss: 0.0362 - val_acc: 0.9895
Epoch 6/10
37800/37800 [==============================] - 527s - loss: 0.0369 - acc: 0.9881 - val_loss: 0.0339 - val_acc: 0.9888
Epoch 7/10
37800/37800 [==============================] - 528s - loss: 0.0327 - acc: 0.9891 - val_loss: 0.0443 - val_acc: 0.9883
Epoch 8/10
37800/37800 [==============================] - 529s - loss: 0.0314 - acc: 0.9901 - val_loss: 0.0440 - val_acc: 0.9881
Epoch 9/10
37800/37800 [==============================] - 527s - loss: 0.0285 - acc: 0.9908 - val_loss: 0.0377 - val_acc: 0.9907
Epoch 10/10
37800/37800 [==============================] - 524s - loss: 0.0252 - acc: 0.9917 - val_loss: 0.0484 - val_acc: 0.9883

In [57]:
history_dict = history.history
history_dict.keys()


Out[57]:
['acc', 'loss', 'val_acc', 'val_loss']

In [58]:
plt.figure(figsize=[10,4])

plt.subplot(121)
plt.plot(range(1, len(history_dict['val_acc'])+1), history_dict['val_acc'])
plt.xlabel('Epochs')
plt.ylabel('Accuracy')

plt.subplot(122)
plt.plot(range(1, len(history_dict['val_loss'])+1), history_dict['val_loss'])
plt.xlabel('Epochs')
plt.ylabel('Loss')


Out[58]:
<matplotlib.text.Text at 0x7f886c4c76d0>

In [59]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d_2 (Conv2D)            (None, 32, 24, 24)        832       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 32, 12, 12)        0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 32, 12, 12)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 64, 10, 10)        18496     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 64, 5, 5)          0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 64, 5, 5)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 1600)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 512)               819712    
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1024)              525312    
_________________________________________________________________
dropout_5 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 10)                10250     
=================================================================
Total params: 1,374,602
Trainable params: 1,374,602
Non-trainable params: 0
_________________________________________________________________

In [60]:
scores = model.evaluate(X_val, Y_val, verbose=0)
print (scores)
print ('Score: {}'.format(scores[0]))
print ('Accuracy: {}'.format(scores[1]))


[0.048411163999255567, 0.98833333333333329]
Score: 0.0484111639993
Accuracy: 0.988333333333

Accuracy has improved from 98.61% to 98.83%.
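
For a side-by-side recap, the validation accuracies reported in this notebook can be collected into a small table (the numbers below are simply copied from the evaluate outputs above):

In [ ]:
# recap of validation accuracies from this notebook's evaluate() calls
results = pd.DataFrame({
    'model': ['MLP', 'CNN (1 conv layer)', 'CNN (2 conv layers)'],
    'val_accuracy': [0.9795, 0.9861, 0.9883]
})
print (results)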

Submission to Kaggle


In [ ]:
# get predicted values for test dataset
predicted_classes = model.predict_classes(X_test)

submissions = pd.DataFrame({'ImageId': list(range(1, len(predicted_classes) + 1)), 
                            "Label": predicted_classes})

submissions.to_csv("submission.csv", index=False, header=True)


20544/28000 [=====================>........] - ETA: 60s
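
Once the cell above finishes writing submission.csv, a quick sanity check of the file before uploading to Kaggle (a minimal sketch):

In [ ]:
# verify the submission file has one prediction per test image
submission = pd.read_csv('submission.csv')
print (submission.shape)  # expected: (28000, 2)
submission.head()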